{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualizing Part-of-Speech Tagging with Yellowbrick\n",
    "\n",
    "This notebook is a sample of the text visualizations that Yellowbrick provides; in particular, it shows a feature that enables visual part-of-speech tagging, which can be used to make decisions about text normalization and vectorization."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys \n",
    "\n",
    "# Modify the path \n",
    "sys.path.append(\"..\")\n",
    "\n",
    "import yellowbrick as yb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First, let's grab a few documents to experiment with"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "texts = {\n",
    "    'nursery_rhyme' : '''Baa, baa, black sheep,\n",
    "                        Have you any wool?\n",
    "                        Yes, sir, yes, sir,\n",
    "                        Three bags full;\n",
    "                        One for the master,\n",
    "                        And one for the dame,\n",
    "                        And one for the little boy\n",
    "                        Who lives down the lane.''',\n",
    "    'algebra'       : '''Algebra (from Arabic \"al-jabr\" meaning \n",
    "                        \"reunion of broken parts\") is one of the \n",
    "                        broad parts of mathematics, together with \n",
    "                        number theory, geometry and analysis. In \n",
    "                        its most general form, algebra is the study \n",
    "                        of mathematical symbols and the rules for \n",
    "                        manipulating these symbols; it is a unifying \n",
    "                        thread of almost all of mathematics.''',\n",
    "    'french_silk'   : '''In a small saucepan, combine sugar and eggs \n",
    "                        until well blended. Cook over low heat, stirring \n",
    "                        constantly, until mixture reaches 160° and coats \n",
    "                        the back of a metal spoon. Remove from the heat. \n",
    "                        Stir in chocolate and vanilla until smooth. Cool \n",
    "                        to lukewarm (90°), stirring occasionally. In a small \n",
    "                        bowl, cream butter until light and fluffy. Add cooled\n",
    "                        chocolate mixture; beat on high speed for 5 minutes \n",
    "                        or until light and fluffy. In another large bowl, \n",
    "                        beat cream until it begins to thicken. Add \n",
    "                        confectioners' sugar; beat until stiff peaks form. \n",
    "                        Fold into chocolate mixture. Pour into crust. Chill \n",
    "                        for at least 6 hours before serving. Garnish with \n",
    "                        whipped cream and chocolate curls if desired. '''\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "##########################################################################\n",
    "# Imports\n",
    "##########################################################################\n",
    "\n",
    "from yellowbrick.text.base import TextVisualizer\n",
    "\n",
    "##########################################################################\n",
    "# PosTagVisualizer\n",
    "##########################################################################\n",
    "\n",
    "class PosTagVisualizer(TextVisualizer):\n",
    "    \"\"\"\n",
    "    A part-of-speech tag visualizer colorizes text to enable\n",
    "    the user to visualize the proportions of nouns, verbs, etc.\n",
    "    and to use this information to make decisions about text\n",
    "    normalization (e.g. stemming vs lemmatization) and\n",
    "    vectorization.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    ax : object, default None\n",
    "        Forwarded unchanged to the TextVisualizer super class.\n",
    "    kwargs : dict\n",
    "        Pass any additional keyword arguments to the super class.\n",
    "\n",
    "    Attributes\n",
    "    ----------\n",
    "    COLORS : dict\n",
    "        Maps a color name (or None for no color) to an ANSI escape\n",
    "        template with a single ``{}`` placeholder for the token.\n",
    "    TAGS : dict\n",
    "        Maps part-of-speech tags to keys of COLORS.\n",
    "    tagged : list of tuples\n",
    "        (color, token) pairs; only set once transform() has run.\n",
    "    \"\"\"\n",
    "    def __init__(self, ax=None, **kwargs):\n",
    "        \"\"\"\n",
    "        Initializes the super class, then builds the ANSI color\n",
    "        templates and the tag-to-color lookup that colorize()\n",
    "        and transform() rely on.\n",
    "        \"\"\"\n",
    "        super(PosTagVisualizer, self).__init__(ax=ax, **kwargs)\n",
    "\n",
    "        # TODO: hard-coding in the ANSI colormap for now.\n",
    "        # Can we let the user reset the colors here?\n",
    "        # Each template wraps the token in a color escape followed\n",
    "        # by the reset sequence.\n",
    "        self.COLORS = {\n",
    "            'white'      : \"\\033[0;37m{}\\033[0m\",\n",
    "            'yellow'     : \"\\033[0;33m{}\\033[0m\",\n",
    "            'green'      : \"\\033[0;32m{}\\033[0m\",\n",
    "            'blue'       : \"\\033[0;34m{}\\033[0m\",\n",
    "            'cyan'       : \"\\033[0;36m{}\\033[0m\",\n",
    "            'red'        : \"\\033[0;31m{}\\033[0m\",\n",
    "            'magenta'    : \"\\033[0;35m{}\\033[0m\",\n",
    "            'black'      : \"\\033[0;30m{}\\033[0m\",\n",
    "            'darkwhite'  : \"\\033[1;37m{}\\033[0m\",\n",
    "            'darkyellow' : \"\\033[1;33m{}\\033[0m\",\n",
    "            'darkgreen'  : \"\\033[1;32m{}\\033[0m\",\n",
    "            'darkblue'   : \"\\033[1;34m{}\\033[0m\",\n",
    "            'darkcyan'   : \"\\033[1;36m{}\\033[0m\",\n",
    "            'darkred'    : \"\\033[1;31m{}\\033[0m\",\n",
    "            'darkmagenta': \"\\033[1;35m{}\\033[0m\",\n",
    "            'darkblack'  : \"\\033[1;30m{}\\033[0m\",\n",
    "             None        : \"\\033[0;0m{}\\033[0m\"\n",
    "        }\n",
    "\n",
    "        # Every value must be a key of COLORS. Tags that are not\n",
    "        # listed here (e.g. punctuation) resolve to None in\n",
    "        # transform() and are printed without a color.\n",
    "        self.TAGS = {\n",
    "            'NN'   : 'green',\n",
    "            'NNS'  : 'green',\n",
    "            'NNP'  : 'green',\n",
    "            'NNPS' : 'green',\n",
    "            'VB'   : 'blue',\n",
    "            'VBD'  : 'blue',\n",
    "            'VBG'  : 'blue',\n",
    "            'VBN'  : 'blue',\n",
    "            'VBP'  : 'blue',\n",
    "            'VBZ'  : 'blue',\n",
    "            'JJ'   : 'red',\n",
    "            'JJR'  : 'red',\n",
    "            'JJS'  : 'red',\n",
    "            'RB'   : 'cyan',\n",
    "            'RBR'  : 'cyan',\n",
    "            'RBS'  : 'cyan',\n",
    "            'IN'   : 'darkwhite',\n",
    "            'POS'  : 'darkyellow',\n",
    "            'PRP'  : 'magenta',\n",
    "            'PRP$' : 'magenta',\n",
    "            'DT'   : 'black',\n",
    "            'CC'   : 'black',\n",
    "            'CD'   : 'black',\n",
    "            'WDT'  : 'black',\n",
    "            'WP'   : 'black',\n",
    "            'WP$'  : 'black',\n",
    "            'WRB'  : 'black',\n",
    "            'EX'   : 'yellow',\n",
    "            'FW'   : 'yellow',\n",
    "            'LS'   : 'yellow',\n",
    "            'MD'   : 'yellow',\n",
    "            'PDT'  : 'yellow',\n",
    "            'RP'   : 'yellow',\n",
    "            'SYM'  : 'yellow',\n",
    "            'TO'   : 'yellow',\n",
    "            # A literal 'None' tag selects the uncolored template.\n",
    "            'None' : None\n",
    "        }\n",
    "\n",
    "    def colorize(self, token, color):\n",
    "        \"\"\"\n",
    "        Colorize text\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        token : str\n",
    "            The text to wrap in an ANSI color escape sequence.\n",
    "        color : str or None\n",
    "            A key of self.COLORS; None selects the template that\n",
    "            applies no color.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        str\n",
    "            The token formatted into the template for ``color``.\n",
    "\n",
    "        Raises\n",
    "        ------\n",
    "        KeyError\n",
    "            If ``color`` is not a key of self.COLORS.\n",
    "        \"\"\"\n",
    "        return self.COLORS[color].format(token)\n",
    "\n",
    "    def transform(self, tagged_tuples):\n",
    "        \"\"\"\n",
    "        The transform method transforms the tagged input for the\n",
    "        part-of-speech tagging visualization. It requires that\n",
    "        documents be in the form of (token, tag) tuples.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        tagged_tuples : iterable of tuples\n",
    "            (token, tag) pairs, e.g. the output of nltk.pos_tag.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        list of tuples\n",
    "            (color, token) pairs, also stored on self.tagged. Tags\n",
    "            missing from self.TAGS get the color None.\n",
    "\n",
    "        Text documents must be tokenized and tagged before passing\n",
    "        to transform()\n",
    "        \"\"\"\n",
    "        # Note the swap: input is (token, tag), output is (color, token).\n",
    "        self.tagged = [\n",
    "            (self.TAGS.get(tag), token) for token, tag in tagged_tuples\n",
    "        ]\n",
    "        return self.tagged\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.corpus import wordnet as wn\n",
    "from nltk import pos_tag, word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[0;32mBaa\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32mbaa\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;31mblack\u001b[0m \u001b[0;32msheep\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;34mHave\u001b[0m \u001b[0;0myou\u001b[0m \u001b[0;30many\u001b[0m \u001b[0;32mwool\u001b[0m \u001b[0;0m?\u001b[0m \u001b[0;0mYes\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;31msir\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;0myes\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32msir\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32mThree\u001b[0m \u001b[0;34mbags\u001b[0m \u001b[0;31mfull\u001b[0m \u001b[0;0m;\u001b[0m \u001b[0;30mOne\u001b[0m \u001b[1;37mfor\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mmaster\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;30mAnd\u001b[0m \u001b[0;30mone\u001b[0m \u001b[1;37mfor\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mdame\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;30mAnd\u001b[0m \u001b[0;30mone\u001b[0m \u001b[1;37mfor\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;31mlittle\u001b[0m \u001b[0;32mboy\u001b[0m \u001b[0;30mWho\u001b[0m \u001b[0;34mlives\u001b[0m \u001b[0;33mdown\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mlane\u001b[0m \u001b[0;0m.\u001b[0m\n",
      "\n",
      "\n",
      "\u001b[0;32mAlgebra\u001b[0m \u001b[0;0m(\u001b[0m \u001b[1;37mfrom\u001b[0m \u001b[0;32mArabic\u001b[0m \u001b[0;0m``\u001b[0m \u001b[0;31mal-jabr\u001b[0m \u001b[0;0m''\u001b[0m \u001b[0;34mmeaning\u001b[0m \u001b[0;0m``\u001b[0m \u001b[0;32mreunion\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;31mbroken\u001b[0m \u001b[0;32mparts\u001b[0m \u001b[0;0m''\u001b[0m \u001b[0;0m)\u001b[0m \u001b[0;34mis\u001b[0m \u001b[0;30mone\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;31mbroad\u001b[0m \u001b[0;32mparts\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;32mmathematics\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;36mtogether\u001b[0m \u001b[1;37mwith\u001b[0m \u001b[0;32mnumber\u001b[0m \u001b[0;32mtheory\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32mgeometry\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;32manalysis\u001b[0m \u001b[0;0m.\u001b[0m \u001b[1;37mIn\u001b[0m \u001b[0;35mits\u001b[0m \u001b[0;36mmost\u001b[0m \u001b[0;31mgeneral\u001b[0m \u001b[0;32mform\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32malgebra\u001b[0m \u001b[0;34mis\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mstudy\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;31mmathematical\u001b[0m \u001b[0;32msymbols\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mrules\u001b[0m \u001b[1;37mfor\u001b[0m \u001b[0;34mmanipulating\u001b[0m \u001b[0;30mthese\u001b[0m \u001b[0;32msymbols\u001b[0m \u001b[0;0m;\u001b[0m \u001b[0;0mit\u001b[0m \u001b[0;34mis\u001b[0m \u001b[0;30ma\u001b[0m \u001b[0;31munifying\u001b[0m \u001b[0;32mthread\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;36malmost\u001b[0m \u001b[0;30mall\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;32mmathematics\u001b[0m \u001b[0;0m.\u001b[0m\n",
      "\n",
      "\n",
      "\u001b[1;37mIn\u001b[0m \u001b[0;30ma\u001b[0m \u001b[0;31msmall\u001b[0m \u001b[0;32msaucepan\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32mcombine\u001b[0m \u001b[0;32msugar\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;32meggs\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;36mwell\u001b[0m \u001b[0;34mblended\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mCook\u001b[0m \u001b[1;37mover\u001b[0m \u001b[0;31mlow\u001b[0m \u001b[0;32mheat\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;34mstirring\u001b[0m \u001b[0;36mconstantly\u001b[0m \u001b[0;0m,\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;32mmixture\u001b[0m \u001b[0;34mreaches\u001b[0m \u001b[0;30m160°\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;34mcoats\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mback\u001b[0m \u001b[1;37mof\u001b[0m \u001b[0;30ma\u001b[0m \u001b[0;32mmetal\u001b[0m \u001b[0;32mspoon\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;34mRemove\u001b[0m \u001b[1;37mfrom\u001b[0m \u001b[0;30mthe\u001b[0m \u001b[0;32mheat\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mStir\u001b[0m \u001b[1;37min\u001b[0m \u001b[0;32mchocolate\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;32mvanilla\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;31msmooth\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mCool\u001b[0m \u001b[0;33mto\u001b[0m \u001b[0;34mlukewarm\u001b[0m \u001b[0;0m(\u001b[0m \u001b[0;30m90°\u001b[0m \u001b[0;0m)\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;34mstirring\u001b[0m \u001b[0;36moccasionally\u001b[0m \u001b[0;0m.\u001b[0m \u001b[1;37mIn\u001b[0m \u001b[0;30ma\u001b[0m \u001b[0;31msmall\u001b[0m \u001b[0;32mbowl\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;32mcream\u001b[0m \u001b[0;32mbutter\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;31mlight\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;31mfluffy\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mAdd\u001b[0m \u001b[0;34mcooled\u001b[0m \u001b[0;31mchocolate\u001b[0m \u001b[0;32mmixture\u001b[0m \u001b[0;0m;\u001b[0m \u001b[0;32mbeat\u001b[0m \u001b[1;37mon\u001b[0m 
\u001b[0;31mhigh\u001b[0m \u001b[0;32mspeed\u001b[0m \u001b[1;37mfor\u001b[0m \u001b[0;30m5\u001b[0m \u001b[0;32mminutes\u001b[0m \u001b[0;30mor\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;31mlight\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;31mfluffy\u001b[0m \u001b[0;0m.\u001b[0m \u001b[1;37mIn\u001b[0m \u001b[0;30manother\u001b[0m \u001b[0;31mlarge\u001b[0m \u001b[0;32mbowl\u001b[0m \u001b[0;0m,\u001b[0m \u001b[0;31mbeat\u001b[0m \u001b[0;32mcream\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;0mit\u001b[0m \u001b[0;34mbegins\u001b[0m \u001b[0;33mto\u001b[0m \u001b[0;34mthicken\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mAdd\u001b[0m \u001b[0;32mconfectioners\u001b[0m \u001b[1;33m'\u001b[0m \u001b[0;32msugar\u001b[0m \u001b[0;0m;\u001b[0m \u001b[0;30mbeat\u001b[0m \u001b[1;37muntil\u001b[0m \u001b[0;31mstiff\u001b[0m \u001b[0;32mpeaks\u001b[0m \u001b[0;32mform\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mFold\u001b[0m \u001b[1;37minto\u001b[0m \u001b[0;31mchocolate\u001b[0m \u001b[0;32mmixture\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mPour\u001b[0m \u001b[1;37minto\u001b[0m \u001b[0;32mcrust\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;32mChill\u001b[0m \u001b[1;37mfor\u001b[0m \u001b[1;37mat\u001b[0m \u001b[0;31mleast\u001b[0m \u001b[0;30m6\u001b[0m \u001b[0;32mhours\u001b[0m \u001b[1;37mbefore\u001b[0m \u001b[0;34mserving\u001b[0m \u001b[0;0m.\u001b[0m \u001b[0;31mGarnish\u001b[0m \u001b[1;37mwith\u001b[0m \u001b[0;31mwhipped\u001b[0m \u001b[0;32mcream\u001b[0m \u001b[0;30mand\u001b[0m \u001b[0;32mchocolate\u001b[0m \u001b[0;32mcurls\u001b[0m \u001b[1;37mif\u001b[0m \u001b[0;34mdesired\u001b[0m \u001b[0;0m.\u001b[0m\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Tokenize and tag each sample text, then print it with one ANSI\n",
    "# color per part of speech.\n",
    "for label, text in texts.items():\n",
    "    tokens = word_tokenize(text)\n",
    "    tagged = pos_tag(tokens)\n",
    "\n",
    "    # transform() stores (color, token) pairs on visualizer.tagged\n",
    "    visualizer = PosTagVisualizer()\n",
    "    visualizer.transform(tagged)\n",
    "\n",
    "    colored = [\n",
    "        visualizer.colorize(token, color)\n",
    "        for color, token in visualizer.tagged\n",
    "    ]\n",
    "    print(' '.join(colored))\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
